This notebook details the various analyses and code implementations used in the paper, “Quantifying and correcting slide-to-slide variation in multiplexed immunofluorescence images.” Please see here for more information on the paper and for complete descriptions of the analyses and concepts therein. The analyses here rely also on the following scripts, available in the Github repository, for further functionality:
all_combat_functions_210709.R: code to run the ComBat algorithmall_fda_functions_210709.R: code to run the functional data registration algorithmall_otsu_functions_210709.R: code to run collect Otsu threshold and metricsknitr::opts_chunk$set(echo = TRUE,warning = FALSE,fig.width = 10,message = FALSE)
## tidyverse packages
require(dplyr)
require(tidyr)
## visualization packages
require(ggplot2)
require(ggpubr)
require(ggforce)
require(viridis)
## methods packages
require(fda)
require(umap)
require(lme4)
require(emmeans)
require(uwot)
require(reticulate)
## set useful variables for later on
x_axis_text_size = 8
vals = rev(inferno(43))
## load data from https://github.com/statimagcoll/atlasAnalysis
sc = atlasAnalysis::loadQuantifiedPCA()
## remove other cols that aren't median cell intensities
to_remove = grep("Median_Cyt|Median_Nuc|Median_Mem",colnames(sc))
sc = sc[,-to_remove]
## select markers for analysis
all_vars = c("Median_Cell_CD3D","Median_Cell_CD8","Median_Cell_COLLAGEN",
"Median_Cell_VIMENTIN","Median_Cell_PANCK","Median_Cell_NAKATPASE",
"Median_Cell_SOX9","Median_Cell_BCATENIN","Median_Cell_OLFM4")
## -- set all methods
methods = c("log10","mean_division","mean_division_log10","cube_root")
methods = c(methods,paste0(methods[1:4],"_fda_registered_x1"))
methods = c(methods,paste0(methods[1:4],"_ComBat_incl_zeroes"))
## -- !end set all methods
Run the scale transformations on each variable as selected above. The transformations included are as follows:
## --- --- ---
## --- log10 + 1
## --- --- ---
for(v in all_vars){
sc[,paste0(v,"_log10")] = log10(sc[,v]+1)
}
## --- --- ---
## --- cube root
## --- --- ---
for(v in all_vars){
sc[,paste0(v,"_cube_root")] = sc[,v]^(1/3)
}
## --- --- ---
## --- mean division, log mean division
## --- --- ---
for(v in all_vars){
slides = split(sc,sc$SlideID)
joe_l = list()
for(i in 1:length(slides)){
s = data.frame(slides[[i]])
s[,paste0(v,"_mean_division")] = s[,v] / mean(s[,v],na.rm=TRUE)
joe_l[[i]] = s
}
sc = data.frame(data.table::rbindlist(joe_l))
sc[,paste0(v,"_mean_division_log10")] = log10(sc[,paste0(v,"_mean_division")]+0.5)
sc[,paste0(v,"_mean_division")] = sc[,paste0(v,"_mean_division")] + -min(sc[,paste0(v,"_mean_division")])
sc[,paste0(v,"_mean_division_log10")] = sc[,paste0(v,"_mean_division_log10")] + -min(sc[,paste0(v,"_mean_division_log10")])
}
## --- --- ---
## --- rename raw values for later analysis
## --- --- ---
for(v in all_vars){
sc[,paste0(v,"_raw")] = sc[,v]
}
## --- --- ---
## --- fda
## --- --- ---
source("all_fda_functions_210709.R")
## loop over all variables and methods to run the registration
for(j in all_vars){
for(i in methods[1:4]){
## update variable names
normedVar = paste0(j,"_",i,"_fda_registered_x")
var = paste0(j,"_",i)
## run 1 iteration of registration
for(k in 1){
sc = register_var(var = var,
normedVar = paste0(normedVar,k),
normedVariter = k,
cb_atl = sc)
var = paste0(normedVar,k)
}
}
}
## --- --- ---
## --- ComBat
## --- --- ---
source("all_combat_functions_210709.R")
## loop over all variables and methods to run ComBat
for(j in all_vars){
for(m in methods[1:4]){
## update variable name
scale_channel = paste0(j,"_",m)
## run ComBat algorithm on given method and marker channel
sc[,paste0(scale_channel,"_ComBat_incl_zeroes")] = adjust_vals(channel = scale_channel,slide_var="SlideID",chan=sc,remove_zeroes = FALSE)
}
}
## --- --- ---
## --- Otsu thresholds
## --- --- ---
source("all_otsu_functions_210709.R")
## list for adjusting variable names
adjList = list("log10" ="Unnormalized: log10",
"mean_division" ="Unnormalized: Mean division",
"mean_division_log10" ="Unnormalized: Mean division log10",
"cube_root" ="Unnormalized: Cube root",
"log10_fda_registered_x1" ="fda: log10",
"mean_division_fda_registered_x1" ="fda: Mean division",
"mean_division_log10_fda_registered_x1" ="fda: Mean division log10",
"cube_root_fda_registered_x1" ="fda: Cube root",
"log10_ComBat_incl_zeroes" ="ComBat: log10",
"mean_division_ComBat_incl_zeroes" ="ComBat: Mean division",
"mean_division_log10_ComBat_incl_zeroes" ="ComBat: Mean division log10",
"cube_root_ComBat_incl_zeroes" ="ComBat: Cube root")
## --- generate dataset of Otsu thresholds
otsus = generate_otsu_data(sc,all_vars = all_vars,c(methods,"raw"))
## --- run misclassification metrics on Otsus
all_means = run_misclassification(sc,otsus,c(methods,"raw"))
htan1 = ggplot(sc) +
geom_density(aes(x=Median_Cell_VIMENTIN_log10,color=SlideID)) +
geom_rug(data=otsus[otsus$channel=="Median_Cell_VIMENTIN",],aes(x=log10_threshold,color=SlideID)) +
theme(legend.position = "None") +
scale_x_continuous(name= paste0("log10"),limits = c(0,5)) +
scale_y_continuous(name="Density",limits=c(0,2)) +
scale_color_manual(values = vals) +
theme(axis.title.x = element_text(size=x_axis_text_size))
htan2 = ggplot(sc) +
geom_density(aes(x=Median_Cell_VIMENTIN_log10_ComBat_incl_zeroes,color=SlideID)) +
geom_rug(data=otsus[otsus$channel=="Median_Cell_VIMENTIN",],aes(x=log10_ComBat_incl_zeroes_threshold,color=SlideID)) +
theme(legend.position = "None") +
scale_x_continuous(name= paste0("log10 (ComBat)")) +
scale_y_continuous(name="Density") +
scale_color_manual(values = vals) +
theme(axis.title.x = element_text(size=x_axis_text_size))
htan3 = ggplot(sc) +
geom_density(aes(x=Median_Cell_VIMENTIN_log10_fda_registered_x1,color=SlideID)) +
geom_rug(data=otsus[otsus$channel=="Median_Cell_VIMENTIN",],aes(x=log10_fda_registered_x1_threshold,color=SlideID)) +
theme(legend.position = "None") +
scale_x_continuous(name= paste0("log10 (fda)"),limits = c(0,5)) +
scale_y_continuous(name="Density",limits=c(0,2)) +
scale_color_manual(values = vals) +
theme(axis.title.x = element_text(size=x_axis_text_size))
htan4 = ggplot(sc) +
geom_density(aes(x=Median_Cell_VIMENTIN_cube_root,color=SlideID)) +
geom_rug(data=otsus[otsus$channel=="Median_Cell_VIMENTIN",],aes(x=cube_root_threshold,color=SlideID)) +
theme(legend.position = "None") +
scale_x_continuous(name= paste0("Cube root"),limits=c(0,30)) +
scale_y_continuous(name="Density") +
scale_color_manual(values = vals) +
theme(axis.title.x = element_text(size=x_axis_text_size))
htan5 = ggplot(sc) +
geom_density(aes(x=Median_Cell_VIMENTIN_cube_root_ComBat_incl_zeroes,color=SlideID)) +
geom_rug(data=otsus[otsus$channel=="Median_Cell_VIMENTIN",],aes(x=cube_root_ComBat_incl_zeroes_threshold,color=SlideID)) +
theme(legend.position = "None") +
scale_x_continuous(name= paste0("Cube root (ComBat)"),limits=c(-25,25)) +
scale_y_continuous(name="Density") +
scale_color_manual(values = vals) +
theme(axis.title.x = element_text(size=x_axis_text_size))
htan6 = ggplot(sc) +
geom_density(aes(x=Median_Cell_VIMENTIN_cube_root_fda_registered_x1,color=SlideID)) +
geom_rug(data=otsus[otsus$channel=="Median_Cell_VIMENTIN",],aes(x=cube_root_fda_registered_x1_threshold,color=SlideID)) +
theme(legend.position = "None") +
scale_x_continuous(name= paste0("Cube root (fda) "),limits=c(0,20)) +
scale_y_continuous(name="Density") +
scale_color_manual(values = vals) +
theme(axis.title.x = element_text(size=x_axis_text_size))
htan7 = ggplot(sc) +
geom_density(aes(x=Median_Cell_VIMENTIN_mean_division,color=SlideID)) +
geom_rug(data=otsus[otsus$channel=="Median_Cell_VIMENTIN",],aes(x=mean_division_threshold,color=SlideID)) +
theme(legend.position = "None") +
scale_x_continuous(name= paste0("Mean division"),limits=c(0,5)) +
scale_y_continuous(name="Density",limits=c(0,3)) +
scale_color_manual(values = vals) +
theme(axis.title.x = element_text(size=x_axis_text_size))
htan8 = ggplot(sc) +
geom_density(aes(x=Median_Cell_VIMENTIN_mean_division_ComBat_incl_zeroes,color=SlideID)) +
geom_rug(data=otsus[otsus$channel=="Median_Cell_VIMENTIN",],aes(x=mean_division_ComBat_incl_zeroes_threshold,color=SlideID)) +
theme(legend.position = "None") +
scale_x_continuous(name= paste0("Mean division (ComBat)"),limits=c(0,2)) +
scale_y_continuous(name="Density") +
scale_color_manual(values = vals) +
theme(axis.title.x = element_text(size=x_axis_text_size))
htan9 = ggplot(sc) +
geom_density(aes(x=Median_Cell_VIMENTIN_mean_division_fda_registered_x1,color=SlideID)) +
geom_rug(data=otsus[otsus$channel=="Median_Cell_VIMENTIN",],aes(x=mean_division_fda_registered_x1_threshold,color=SlideID)) +
theme(legend.position = "None") +
scale_x_continuous(name= paste0("Mean division"),limits=c(0,5)) +
scale_y_continuous(name="Density",limits=c(0,3)) +
scale_color_manual(values = vals) +
theme(axis.title.x = element_text(size=x_axis_text_size))
htan10 = ggplot(sc) +
geom_density(aes(x=Median_Cell_VIMENTIN_mean_division_log10,color=SlideID)) +
geom_rug(data=otsus[otsus$channel=="Median_Cell_VIMENTIN",],aes(x=mean_division_log10_threshold,color=SlideID)) +
theme(legend.position = "None") +
scale_x_continuous(name= paste0("Mean division log10"),limits=c(0,2)) +
scale_y_continuous(name="Density",limits=c(0,5)) +
scale_color_manual(values = vals) +
theme(axis.title.x = element_text(size=x_axis_text_size))
htan11 = ggplot(sc) +
geom_density(aes(x=Median_Cell_VIMENTIN_mean_division_log10_ComBat_incl_zeroes,color=SlideID)) +
geom_rug(data=otsus[otsus$channel=="Median_Cell_VIMENTIN",],aes(x=mean_division_log10_ComBat_incl_zeroes_threshold,color=SlideID)) +
theme(legend.position = "None") +
scale_x_continuous(name= paste0("Mean division log10 (ComBat)")) +
scale_y_continuous(name="Density") +
scale_color_manual(values = vals) +
theme(axis.title.x = element_text(size=x_axis_text_size))
htan12 = ggplot(sc) +
geom_density(aes(x=Median_Cell_VIMENTIN_mean_division_log10_fda_registered_x1,color=SlideID)) +
geom_rug(data=otsus[otsus$channel=="Median_Cell_VIMENTIN",],aes(x=mean_division_log10_fda_registered_x1_threshold,color=SlideID)) +
theme(legend.position = "None") +
scale_x_continuous(name= paste0("Mean division log10 (fda)")) +
scale_y_continuous(name="Density") +
scale_color_manual(values = vals) +
theme(axis.title.x = element_text(size=x_axis_text_size))
## fig1 plot
ggpubr::ggarrange(htan1,htan2,htan3,
htan4,htan5,htan6,
htan7,htan8,htan9,
htan10,htan11,htan12,nrow=4,ncol =3)
## --- calculate as ratios to raw data
all_ratios=all_means
all_ratios[,2:ncol(all_ratios)] = round(matrix(all_means[,2:ncol(all_ratios)]) - as.vector(data.frame(all_means[13,2:ncol(all_ratios)])),3)
not_rows = c("raw")
plot_values = all_ratios[!(all_ratios$method %in% not_rows),]
## change names for cleaner plotting
method_values = c("Unnormalized: Cube root",
"ComBat: Cube root",
"fda: Cube root",
"Unnormalized: log10",
"ComBat: log10",
"fda: log10",
"Unnormalized: Mean division",
"ComBat: Mean division",
"fda: Mean division",
"Unnormalized: Mean division log10",
"ComBat: Mean division log10",
"fda: Mean division log10")
plot_values$method = method_values
## calculate mean misclassification
mean_values = data.frame(rowMeans(plot_values[,-c(1)]))
mean_values$method = method_values
colnames(mean_values)[1] = c("means")
## plotting variables
size_val = 3
alpha_val = .8
## fig2a plot
ggplot(plot_values) +
geom_vline(xintercept=0)+
geom_point(aes(x=Median_Cell_CD3D,y=method,color="CD3"),size=size_val,alpha=alpha_val)+
geom_point(aes(x=Median_Cell_CD8,y=method,color="CD8"),size=size_val,alpha=alpha_val)+
geom_point(aes(x=Median_Cell_COLLAGEN,y=method,color="COLLAGEN"),size=size_val,alpha=alpha_val)+
geom_point(aes(x=Median_Cell_NAKATPASE,y=method,color="NAKATPASE"),size=size_val,alpha=alpha_val)+
geom_point(aes(x=Median_Cell_PANCK,y=method,color="PANCK"),size=size_val,alpha=alpha_val)+
geom_point(aes(x=Median_Cell_VIMENTIN,y=method,color="VIMENTIN"),size=size_val,alpha=alpha_val) +
geom_point(aes(x=Median_Cell_SOX9,y=method,color="SOX9"),size=size_val,alpha=alpha_val) +
geom_point(aes(x=Median_Cell_BCATENIN,y=method,color="BCATENIN"),size=size_val,alpha=alpha_val) +
geom_point(aes(x=Median_Cell_OLFM4,y=method,color="OLFM4"),size=size_val,alpha=alpha_val) +
geom_point(data = mean_values,aes(x=means,y=method,group=method),color="black",fill="white",shape=23,size=3)+
scale_x_continuous("Change in mean misclassification for method, compared to raw data")+
scale_y_discrete("")+
scale_color_discrete("Marker") +
theme(axis.text.y = element_text(hjust = 0))
## --- calculate Otsus on manual labels for CD3 & CD8
cd3s = c(); cd8s = c()
pred_methods = c(methods, "raw")
## calculate Otsu threshold
for(m in pred_methods){
cd3s = c(cd3s,get_otsu_sk(sc[,paste0("Median_Cell_CD3D","_",m)]))
cd8s = c(cd8s,get_otsu_sk(sc[,paste0("Median_Cell_CD8","_",m)]))
}
## generate boolean for marker positive
for(i in 1:length(cd3s)){
m = pred_methods[i]
sc[,paste0("CD3_Otsu_Boolean_",m)] = sc[,paste("Median_Cell_CD3D",m,sep = "_")] > cd3s[i]
sc[,paste0("CD8_Otsu_Boolean_",m)] = sc[,paste("Median_Cell_CD8",m,sep = "_")] > cd8s[i]
}
## turn manual labels into booleans
sc$cd3_bool = ifelse(sc$CD3D=="CD3D+",1,0)
sc$cd8_bool = ifelse(sc$CD8=="CD8+",1,0)
## --- calculate accuracy of maintaining manual labels
cd3_acc = c()
cd8_acc = c()
for(m in pred_methods){
cmat1 = caret::confusionMatrix(as.factor(sc$cd3_bool), as.factor(sc[,paste0("CD3_Otsu_Boolean_",m)]*1))
cd3_acc = c(cd3_acc,cmat1$overall["Accuracy"])
cmat2 = caret::confusionMatrix(as.factor(sc$cd8_bool), as.factor(sc[,paste0("CD8_Otsu_Boolean_",m)]*1))
cd8_acc = c(cd8_acc,cmat2$overall["Accuracy"])
}
## subtract raw from each method
cd3_acc = cd3_acc - cd3_acc[13]
cd8_acc = cd8_acc - cd8_acc[13]
## create plotting dataset
acc_dat = data.frame(c(pred_methods[-13],pred_methods[-13]))
colnames(acc_dat) = c("method")
acc_dat$method = unname(unlist(adjList[acc_dat$method]))
acc_dat$values = c(cd3_acc[-13],cd8_acc[-13])
acc_dat$marker = c(rep("CD3",12),rep("CD8",12))
## fig2b plot
ggplot(acc_dat)+
geom_vline(xintercept = 0)+
geom_point(aes(x=values,y=method,color=marker),size=3) +
scale_x_continuous("Change in marker-positive accuracy, compared to raw data")+
scale_y_discrete("")+
scale_color_manual("Marker",values=c("lightskyblue","chocolate3")) +
theme(axis.text.y = element_text(hjust = 0),legend.position = "right")
## --- --- ---
### --- get baseline values from raw data
## --- --- ---
m = "raw"
## ---- CD3
## modeling
cd3_form = as.formula(paste0("Median_Cell_CD3D_",m,"~epi*lessBroad + (1|SlideID)"))
cd3_mod = lmer(cd3_form,data=sc)
## setup variance vals
varcomp_cd3 = (c(unname(attr(summary(cd3_mod)$varcor$SlideID,"stddev")),
summary(cd3_mod)$sigma))^2
varcomp_cd3 = varcomp_cd3/sum(varcomp_cd3)
svar1 = varcomp_cd3[1]
## ---- CD8
## modeling
cd8_form = as.formula(paste0("Median_Cell_CD8_",m,"~epi*lessBroad + (1|SlideID)"))
cd8_mod = lmer(cd8_form,data=sc)
## setup variance vals
varcomp_cd8 = (c(unname(attr(summary(cd8_mod)$varcor$SlideID,"stddev")),
summary(cd8_mod)$sigma))^2
varcomp_cd8 = varcomp_cd8/sum(varcomp_cd8)
svar2 = varcomp_cd8[1]
## --- --- ---
### --- get variance components from each method
## --- --- ---
cd3_vcs = data.frame()
cd8_vcs = data.frame()
for(m in methods){
## ---- CD3
## modeling
cd3_form = as.formula(paste0("Median_Cell_CD3D_",m,"~epi*lessBroad + (1|SlideID)"))
cd3_mod = lmer(cd3_form,data=sc)
varcomp_cd3 = (c(unname(attr(summary(cd3_mod)$varcor$SlideID,"stddev")),
summary(cd3_mod)$sigma))^2
## setup variance vals
varcomp_cd3 = varcomp_cd3/sum(varcomp_cd3)
varcomp_cd3 = data.frame(varcomp_cd3)
## data setup
colnames(varcomp_cd3)[1] = c("variance")
varcomp_cd3$level = c("Slide","Residual")
varcomp_cd3$method = m
cd3_vcs = rbind(cd3_vcs,varcomp_cd3)
## ---- CD8
## modeling
cd8_form = as.formula(paste0("Median_Cell_CD8_",m,"~epi*lessBroad + (1|SlideID)"))
cd8_mod = lmer(cd8_form,data=sc)
varcomp_cd8 = (c(unname(attr(summary(cd8_mod)$varcor$SlideID,"stddev")),
summary(cd8_mod)$sigma))^2
## setup variance vals
varcomp_cd8 = varcomp_cd8/sum(varcomp_cd8)
varcomp_cd8 = data.frame(varcomp_cd8)
## data setup
colnames(varcomp_cd8)[1] = c("variance")
varcomp_cd8$level = c("Slide","Residual")
varcomp_cd8$method = m
cd8_vcs = rbind(cd8_vcs,varcomp_cd8)
}
#cd3_vcs = cd3_vcs[!(cd3_vcs$method %in% not_cols),]
#cd8_vcs = cd8_vcs[!(cd8_vcs$method %in% not_cols),]
## change values
cd3_vcs$method = unname(unlist(adjList[cd3_vcs$method]))
cd8_vcs$method = unname(unlist(adjList[cd8_vcs$method]))
## generate plots
gcd3 = ggplot(cd3_vcs) +
geom_col(aes(y=method,x=variance,fill=level)) +
theme(axis.text.y = element_text(hjust = 1)) +
scale_fill_manual("Level",values=rev(viridis::magma(2)))+
geom_vline(xintercept=svar1) +
scale_x_continuous("Proportion of variance")+
scale_y_discrete("")+
ggtitle("CD3") +
theme(axis.text.y = element_text(size = 10))
gcd8 = ggplot(cd8_vcs) +
geom_col(aes(y=method,x=variance,fill=level)) +
theme(axis.text.y = element_text(hjust = 1)) +
scale_fill_manual("Level",values=rev(viridis::magma(2)))+
geom_vline(xintercept=svar2) +
scale_x_continuous("Proportion of variance")+
scale_y_discrete("")+
ggtitle("CD8") +
theme(axis.text.y = element_text(size = 10))
## fig3 plot
ggpubr::ggarrange(gcd3,gcd8,nrow=2,ncol=1,legend = "bottom",common.legend = TRUE)
tts = read.csv('/media/disk2/atlas_mxif/data/TissueSubTypes Colon Map.csv')
## --- match capitalization and types, combine data
names(tts)[1] = 'TissueID'
tts$adenSSLcrc = ifelse(tts$tissueSubType %in% c('Tub', 'TV'), 'AD',
ifelse(tts$tissueSubType %in% c('HP', 'HP/SSL', 'SSL', 'Tub/SSL'), 'SSL',
ifelse(tts$tissueSubType %in% c('CRC'), 'CRC', NA )) )
tts$SlideID = sc$SlideID[match(tts$TissueID, sc$TissueID)]
tts = tts[,c('SlideID', 'adenSSLcrc')]
sc = merge(sc, tts, all.x=TRUE)
sc = sc[ sc$lessBroad %in% c('AD', 'SSL', 'HP'),]
sc = sc[ sc$Tumor>0,]
#sc$cd3_bool = ifelse(sc$CD3D=="CD3D+",1,0)
#sc$cd8_bool = ifelse(sc$CD8=="CD8+",1,0)
## --- group data
small_sc= sc %>%
group_by(Slide_Region,epi,SlideID,lessBroad,.add=TRUE) %>%
summarise_at(colnames(sc)[c(which(grepl("Otsu_Boolean",colnames(sc))),which(colnames(sc) %in% c("cd3_bool","cd8_bool")))],mean)
small_sc$epi= ifelse(small_sc$epi == 1,"Epithelium","Stroma")
# methods = c("log10","mean_division","mean_division_log10","cube_root","fifth_root","raw")
# methods = c(methods,paste0(methods,"_fda_registered_x1"))
# methods = c(methods,paste0(methods[1:6],"_ComBat_incl_zeroes"))
# not_cols = c("fifth_root","fifth_root_ComBat_incl_zeroes","fifth_root_fda_registered_x1",
# "raw","raw_ComBat_incl_zeroes",
# "raw_fda_registered_x1")
# methods = methods[!(methods %in% not_cols)]
## --- setup variables for analysis
methods = sort(methods)[c(4:6,1:3,7:9,10:12)]
methods_bool = paste0("Otsu_Boolean_",methods)
CD3_vars = paste("CD3", methods_bool, sep="_" )
CD8_vars = paste("CD8", methods_bool, sep="_" )
CD3_varcomps = data.frame()
CD8_varcomps = data.frame()
run_var_comps = function(v){
## --- generate model and fits
formula_v = as.formula(paste0(v,"~epi*lessBroad+(1|SlideID)"))
mod_v = lmer(formula_v,data=small_sc)
CIs_v = emmeans(mod_v, as.formula(paste('~ 1 | ', "lessBroad", ' + epi')), type='response')
fits_v = as.data.frame(summary(CIs_v))
## --- calc varcomps
varcomp_v = (c(unname(attr(summary(mod_v)$varcor$SlideID,"stddev")),
summary(mod_v)$sigma))^2
varcomp_v = data.frame(varcomp_v)
varcomp_v$level = c("Slide","Cell (Residual)")
varcomp_v$method = stringr::str_replace(v,"CD3_Otsu_Boolean_","")
colnames(varcomp_v)[1] = c("Variance")
return(list("varcomps"=varcomp_v,
"fits"=fits_v))
}
get_cp_plot = function(v,fits,m_v,marker="CD3",dat=small_sc){
ggplot(dat, aes(x=as.factor(epi),y=get(v),color=as.factor(epi),fill=as.factor(epi)))+
gghalves::geom_half_point(side = "l",range_scale = .4, alpha = .5) +
geom_boxplot(width=0.1,alpha=0.5,outlier.shape = NA) +
## random effects estimated means, not included.
# geom_jitter(data=fits[fits$lessBroad=="AD",],aes(x=as.factor(epi),y=emmean),size=2,width=0.05,shape=23,fill="lightblue4") +
# geom_jitter(data=fits[fits$lessBroad=="HP",],aes(x=as.factor(epi),y=emmean),size=2,width=0.05,shape=23,fill="midnightblue") +
# geom_jitter(data=fits[fits$lessBroad=="SSL",],aes(x=as.factor(epi),y=emmean), size=2,width=0.05,shape=23,fill="lightslateblue") +
ggdist::stat_halfeye(adjust = .85, width = .5, .width = 0, justification = -.25, point_colour = NA,alpha=0.5) +
scale_x_discrete(m_v) +
scale_y_continuous(paste0(marker," Proportions"),limits=c(-0.05,1.05))+
scale_fill_manual(values=c("lightslateblue","coral1"))+
scale_color_manual(values=c("lightslateblue","coral1"))+
theme(legend.position = "None")
#scale_fill_manual("Tumor Type",values=viridis::magma(10)[c(4,7,10)])
}
## CD3 base
vb1 = "cd3_bool"
vbobj1 = run_var_comps(v = vb1)
b1 = get_cp_plot(vb1, vbobj1$fits,m_v="CD3 Ground Truth Label",marker="CD3")
## CD8 base
vb2 = "cd8_bool"
vbobj2 = run_var_comps(v = vb2)
b2 = get_cp_plot(vb2, vbobj2$fits,m_v="CD8 Ground Truth Label",marker="CD8")
## fig4a plot
bfig = ggpubr::ggarrange(b1,b2,common.legend = TRUE,legend = "none")
ggpubr::annotate_figure(bfig, top=text_grob("Baseline proportions in CD3 and CD8 using manual labels"))
v1 = CD3_vars[1]
vobj1 = run_var_comps(v = v1)
g1 = get_cp_plot(v1, vobj1$fits,m_v="Unnormalized: log10",marker="CD3")
v2 = CD3_vars[2]
vobj2 = run_var_comps(v = v2)
g2 = get_cp_plot(v2, vobj2$fits,m_v="ComBat: log10",marker="CD3")
v3 = CD3_vars[3]
vobj3 = run_var_comps(v = v3)
g3 = get_cp_plot(v3, vobj3$fits,m_v="fda: log10",marker="CD3")
v4 = CD3_vars[4]
vobj4 = run_var_comps(v = v4)
g4 = get_cp_plot(v4, vobj4$fits,m_v="Unnormalized: Cube root",marker="CD3")
v5 = CD3_vars[5]
vobj5 = run_var_comps(v = v5)
g5 = get_cp_plot(v5, vobj5$fits,m_v="ComBat: Cube root",marker="CD3")
v6 = CD3_vars[6]
vobj6 = run_var_comps(v = v6)
g6 = get_cp_plot(v6, vobj6$fits,m_v="fda: Cube root",marker="CD3")
v7 = CD3_vars[7]
vobj7 = run_var_comps(v = v7)
g7 = get_cp_plot(v7, vobj7$fits,m_v="Unnormalized: Mean division",marker="CD3")
v8 = CD3_vars[8]
vobj8 = run_var_comps(v = v8)
g8 = get_cp_plot(v8, vobj8$fits,m_v="ComBat: Mean division",marker="CD3")
v9 = CD3_vars[9]
vobj9 = run_var_comps(v = v9)
g9 = get_cp_plot(v9, vobj9$fits,m_v="fda: Mean division",marker="CD3")
v10 = CD3_vars[10]
vobj10 = run_var_comps(v = v10)
g10 = get_cp_plot(v10, vobj10$fits,m_v="Unnormalized: Mean division log10",marker="CD3")
v11 = CD3_vars[11]
vobj11 = run_var_comps(v = v11)
g11 = get_cp_plot(v11, vobj11$fits,m_v="ComBat: Mean division log10",marker="CD3")
v12 = CD3_vars[12]
vobj12 = run_var_comps(v = v12)
g12 = get_cp_plot(v12, vobj12$fits,m_v="fda: Mean division log10",marker="CD3")
## fig4b plot
cd3fig = ggpubr::ggarrange(plotlist=list(g1,g2,g3,
g4,g5,g6,
g7,g8,g9,
g10,g11,g12),common.legend = TRUE,legend = "none", ncol = 3,nrow=4)
ggpubr::annotate_figure(cd3fig, top=text_grob("Proportions of CD3 by method"))
v1 = CD8_vars[1]
vobj1 = run_var_comps(v = v1)
g1 = get_cp_plot(v1, vobj1$fits,m_v="Unnormalized: log10",marker="CD8")
v2 = CD8_vars[2]
vobj2 = run_var_comps(v = v2)
g2 = get_cp_plot(v2, vobj2$fits,m_v="ComBat: log10",marker="CD8")
v3 = CD8_vars[3]
vobj3 = run_var_comps(v = v3)
g3 = get_cp_plot(v3, vobj3$fits,m_v="fda: log10",marker="CD8")
v4 = CD8_vars[4]
vobj4 = run_var_comps(v = v4)
g4 = get_cp_plot(v4, vobj4$fits,m_v="Unnormalized: Cube root",marker="CD8")
v5 = CD8_vars[5]
vobj5 = run_var_comps(v = v5)
g5 = get_cp_plot(v5, vobj5$fits,m_v="ComBat: Cube root",marker="CD8")
v6 = CD8_vars[6]
vobj6 = run_var_comps(v = v6)
g6 = get_cp_plot(v6, vobj6$fits,m_v="fda: Cube root",marker="CD8")
v7 = CD8_vars[7]
vobj7 = run_var_comps(v = v7)
g7 = get_cp_plot(v7, vobj7$fits,m_v="Unnormalized: Mean division",marker="CD8")
v8 = CD8_vars[8]
vobj8 = run_var_comps(v = v8)
g8 = get_cp_plot(v8, vobj8$fits,m_v="ComBat: Mean division",marker="CD8")
v9 = CD8_vars[9]
vobj9 = run_var_comps(v = v9)
g9 = get_cp_plot(v9, vobj9$fits,m_v="fda: Mean division",marker="CD8")
v10 = CD8_vars[10]
vobj10 = run_var_comps(v = v10)
g10 = get_cp_plot(v10, vobj10$fits,m_v="Unnormalized: Mean division log10",marker="CD8")
v11 = CD8_vars[11]
vobj11 = run_var_comps(v = v11)
g11 = get_cp_plot(v11, vobj11$fits,m_v="ComBat: Mean division log10",marker="CD8")
v12 = CD8_vars[12]
vobj12 = run_var_comps(v = v12)
g12 = get_cp_plot(v12, vobj12$fits,m_v="fda: Mean division log10",marker="CD8")
## fig4c plot
cd8fig = ggpubr::ggarrange(plotlist=list(g1,g2,g3,
g4,g5,g6,
g7,g8,g9,
g10,g11,g12),common.legend = TRUE,legend = "none", ncol = 3,nrow=4)
ggpubr::annotate_figure(cd8fig, top=text_grob("Proportions of CD8 by method"))
Note that the UMAP algorithm is stochastic, and because of the data size, we are downsampling so these plots may not be identical to those in the paper.
## --- Multivariate clustering
## create downsampled data
scs = split(sc,sc$SlideID)
ds_scs_list = lapply(X=1:length(scs),FUN = function(i){
nr=nrow(scs[[i]])
scs[[i]][sample(x = 1:nr,size = round(0.1 * nr)),]
})
## set vars for clustering
dsd = data.frame(data.table::rbindlist(ds_scs_list))
epi_vars = all_vars[3:6]
all_dfs = data.frame()
## run umap on all methods
for(m in methods){
## get umap loadings
u1 = tumap(dsd[,paste0(epi_vars,"_",m)])
df_add = data.frame(u1[,1],u1[,2])
colnames(df_add) = c("U1","U2")
## add relevant columns
df_add$method = m
df_add$epi = dsd$epi
df_add$tumor = dsd$Tumor
df_add$SlideID = dsd$SlideID
all_dfs = rbind(df_add,all_dfs)
}
m = "raw"
u1 = tumap(dsd[,paste0(epi_vars,"_",m)])
df_add = data.frame(u1[,1],u1[,2])
colnames(df_add) = c("U1","U2")
## add relevant columns
df_add$method = m
df_add$epi = dsd$epi
df_add$tumor = dsd$Tumor
df_add$SlideID = dsd$SlideID
## tissue class plot
e1 = ggplot(df_add) +
geom_point(aes(x=U1,y=U2,color=factor(epi)),size=0.01,alpha=0.05) +
scale_x_continuous("Raw (by Tissue Class)") +
scale_y_continuous("") +
scale_color_manual("Tissue type",labels=c("Stroma","Epithelium"),values=c("tan1","skyblue4")) +
theme_minimal() +
theme(legend.position = "bottom") +
guides(colour = guide_legend(override.aes = list(size=2)))
## slide plot
s1 = ggplot(df_add) +
geom_point(aes(x=U1,y=U2,color=factor(SlideID)),size=0.01,alpha=0.05) +
scale_x_continuous("Raw (by Slide)") +
scale_y_continuous("") +
theme_minimal() +
theme(legend.position = "None")
## fig5ab plot
ggpubr::ggarrange(e1,s1)
msplit = split(all_dfs,all_dfs$method)
msplit = msplit[c(4,5,6,1,2,3,7,8,9,10,11,12)]
#msplit = append(msplit,values="blank",after=8)
new_labs = c("Unnormalized: log10",
"ComBat: log10",
"fda: log10",
"Unnormalized: Cube root",
"ComBat: Cube root",
"fda: Cube root",
"Unnormalized: Mean division",
"ComBat: Mean division",
"fda: Mean division",
"Unnormalized: Mean division log10",
"ComBat: Mean division log10",
"fda: Mean division log10")
## 4x3 figure of each method UMAP
plist = lapply(X=1:length(msplit),FUN=function(i){
m = msplit[[i]]
ggplot(m) +
geom_point(aes(x=U1,y=U2,color=factor(epi)),size=0.01,alpha=0.35) +
scale_x_continuous(paste0(new_labs[i],""))+
scale_y_continuous("") +
scale_color_manual("Tissue type",labels=c("Stroma","Epithelium"),values=c("tan1","skyblue4")) +
theme_minimal() +
guides(colour = guide_legend(override.aes = list(size=2)))
})
## fig5c plot
ggpubr::ggarrange(plotlist=plist,legend = "bottom",common.legend = TRUE,nrow=4,ncol=3)
## 4x3 figure of each method UMAP
plist = lapply(X=1:length(msplit),FUN=function(i){
m = msplit[[i]]
ggplot(m) +
geom_point(aes(x=U1,y=U2,color=factor(SlideID)),size=0.01,alpha=0.05) +
scale_x_continuous(paste0(new_labs[i]," (by Slide)"))+
scale_y_continuous("") +
#scale_color_manual(values=vals) +
theme_minimal() +
theme(legend.position = "None")
# }
})
## fig5d plot
ggpubr::ggarrange(plotlist=plist,nrow=4,ncol=3)
R Version infosessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] splines stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] reticulate_1.20 uwot_0.1.10 emmeans_1.5.4 lme4_1.1-26
## [5] umap_0.2.7.0 fda_5.1.9 fds_1.8 RCurl_1.98-1.2
## [9] rainbow_3.6 pcaPP_1.9-73 MASS_7.3-53 Matrix_1.2-18
## [13] viridis_0.5.1 viridisLite_0.3.0 ggforce_0.3.3 ggpubr_0.4.0
## [17] ggplot2_3.3.3 tidyr_1.1.3 dplyr_1.0.5
##
## loaded via a namespace (and not attached):
## [1] minqa_1.2.4 colorspace_2.0-0 ggsignif_0.6.1
## [4] ellipsis_0.3.1 class_7.3-17 rio_0.5.26
## [7] mclust_5.4.7 estimability_1.3 farver_2.1.0
## [10] lubridate_1.7.10 prodlim_2019.11.13 RSpectra_0.16-0
## [13] fansi_0.4.2 mvtnorm_1.1-1 codetools_0.2-16
## [16] knitr_1.31 polyclip_1.10-0 jsonlite_1.7.2
## [19] nloptr_1.2.2.2 pROC_1.17.0.1 pbkrtest_0.5-0.1
## [22] caret_6.0-86 broom_0.7.5 cluster_2.1.0
## [25] png_0.1-7 ggdist_2.4.1 compiler_4.0.3
## [28] backports_1.2.1 tweenr_1.0.2 htmltools_0.5.1.1
## [31] tools_4.0.3 coda_0.19-4 gtable_0.3.0
## [34] glue_1.4.2 reshape2_1.4.4 rappdirs_0.3.3
## [37] Rcpp_1.0.6 carData_3.0-4 cellranger_1.1.0
## [40] jquerylib_0.1.3 vctrs_0.3.6 nlme_3.1-149
## [43] gghalves_0.1.1 iterators_1.0.13 timeDate_3043.102
## [46] xfun_0.21 gower_0.2.2 stringr_1.4.0
## [49] openxlsx_4.2.3 lifecycle_1.0.0 statmod_1.4.35
## [52] rstatix_0.7.0 scales_1.1.1 ipred_0.9-10
## [55] hms_1.0.0 parallel_4.0.3 yaml_2.2.1
## [58] curl_4.3 gridExtra_2.3 sass_0.3.1
## [61] rpart_4.1-15 stringi_1.5.3 highr_0.8
## [64] foreach_1.5.1 e1071_1.7-4 boot_1.3-25
## [67] zip_2.1.1 lava_1.6.8.1 hdrcde_3.4
## [70] atlasAnalysis_0.0.1 rlang_0.4.10 pkgconfig_2.0.3
## [73] bitops_1.0-6 distributional_0.2.2 evaluate_0.14
## [76] lattice_0.20-41 purrr_0.3.4 labeling_0.4.2
## [79] ks_1.12.0 recipes_0.1.15 cowplot_1.1.1
## [82] tidyselect_1.1.0 RcppAnnoy_0.0.18 plyr_1.8.6
## [85] magrittr_2.0.1 R6_2.5.0 generics_0.1.0
## [88] combinat_0.0-8 pillar_1.5.1 haven_2.3.1
## [91] foreign_0.8-79 withr_2.4.1 nnet_7.3-14
## [94] survival_3.2-7 abind_1.4-5 tibble_3.1.0
## [97] crayon_1.4.1 car_3.0-10 KernSmooth_2.23-17
## [100] utf8_1.1.4 rmarkdown_2.7 grid_4.0.3
## [103] readxl_1.3.1 data.table_1.14.0 ModelMetrics_1.2.2.2
## [106] forcats_0.5.1 digest_0.6.27 xtable_1.8-4
## [109] stats4_4.0.3 openssl_1.4.3 munsell_0.5.0
## [112] bslib_0.2.4 askpass_1.1